## T Tang - Replication of Mouw & Kalleberg 2010

########Data Load#############
#load package: foreign
#load data
data = read.dta("C://R/Replication/Data/kocc99_occ80.dta")
#dim(data) 306077     50

#get rid of data rows that have NA wage
data = data[!is.na(data$lnwage_impute1), ]
#282938     50

##############################
#Part 0: Data preparation
##############################

#get wages
lnwage = data$lnwage_impute1
n = length(lnwage)

#collect independent variables 
constant = rep(1, n, 1)
age = data$age
educ = data$grade92
race = data$race
union = data$unionmme
#turn union into a 0-1 dummy 
union[union == 2] = 0

#fill in dummy matrices (min & max age are 16 to 65, drop dummies that are all 0)
agebucket = ceiling(age/5)
agebucket = agebucket - min(agebucket) + 1
educbucket = educ%%30
educbucket = educbucket - min(educbucket) + 1
age_dum = matrix(0, n, max(agebucket))
educ_dum = matrix(0, n, max(educbucket))
race_dum = matrix (0, n, max(race)) 

for (i in 1:n){
	age_dum[i, agebucket[i]] = 1
	educ_dum[i, educbucket[i]] = 1
	race_dum[i, race[i]] = 1
}

#put everything together (omit constant on X; add this later)
X = cbind(age_dum[,c(1:(ncol(age_dum) - 1))], educ_dum[, c(1:(ncol(educ_dum) - 1))], 
race_dum[,c(1:(ncol(race_dum) - 1))], union)
#dim(X) 282938     28

#compute averages by occupation 
avgY = aggregate(lnwage, list(occ = as.numeric(data$occ80)), mean)
avgX = aggregate(X, list(occ = as.numeric(data$occ80)), mean) #dim 488  32
numocc = as.numeric(data$occ80)
k = nrow(avgY)

#assign avg vectors 
Ya = rep(0, n)
Xa = matrix(0, n, ncol(avgX))
for (i in 1:k){
	#assign average
	index = which(numocc == avgY$occ[i])
	Ya[index] = avgY$x[i]
	for (j in 1:dim(avgX)[2]){
		Xa[index, j] = avgX[i, j]
	}
}
Xa = Xa[,c(2:ncol(Xa))]
#dim(Xa)   282938     28

##################################################
#Part 1: Variance decomposition (Replication)
##################################################

#compute differences
wagediff = data.matrix(lnwage - Ya)
Xdiff = data.matrix(X - Xa)

#run robust regression of wagediff on Xdiff to get beta estimates with white heteroskedastic errors
#R doesn't have a built in package like Stata for heteroskedastic robust std errors. 
#Since estimated coefficients are the same between heteroskedastic robust & homoskedastic OLS, 
#and std errors are not reported by the authors, I will use OLS to get coefficients.

result_OLS = lm(wagediff~Xdiff)
#############################################################################
#OLS Coefficients:

#(Intercept)	       Xdiff        Xdiff        Xdiff        Xdiff        Xdiff  
#  8.708e-16   -3.194e-01   -2.553e-01   -1.588e-01   -6.892e-02   -8.729e-03  
#      Xdiff        Xdiff        Xdiff        Xdiff        Xdiff        Xdiff  
#  1.101e-02    3.597e-02    5.692e-02    3.774e-02   -7.941e-01   -8.129e-01  
#      Xdiff        Xdiff        Xdiff        Xdiff        Xdiff        Xdiff  
# -8.125e-01   -7.826e-01   -7.675e-01   -7.452e-01   -7.352e-01   -7.387e-01  
#      Xdiff        Xdiff        Xdiff        Xdiff        Xdiff        Xdiff  
# -6.320e-01   -5.618e-01   -5.447e-01   -5.124e-01   -3.460e-01   -1.511e-01  
#      Xdiff        Xdiff        Xdiff        Xdiff   Xdiffunion  
# -4.644e-02    3.030e-02   -3.543e-02   -8.150e-03    2.370e-01  
#############################################################################
#############################################################################
#For comparison: outlier robust linear regression (RLM)
#result_rlm = rlm(wagediff~Xdiff)
#Converged in 4 iterations

#Coefficients:
# (Intercept)        Xdiff        Xdiff        Xdiff        Xdiff        Xdiff 
#-0.002025455 -0.319886396 -0.255114574 -0.159196504 -0.069502764 -0.011198150 
#       Xdiff        Xdiff        Xdiff        Xdiff        Xdiff        Xdiff 
# 0.009775201  0.034095579  0.054294142  0.035119862 -0.805806403 -0.818911164 
#       Xdiff        Xdiff        Xdiff        Xdiff        Xdiff        Xdiff 
#-0.815545382 -0.783669753 -0.770708884 -0.747986008 -0.736666756 -0.740794474 
#       Xdiff        Xdiff        Xdiff        Xdiff        Xdiff        Xdiff 
#-0.634340524 -0.564935566 -0.548416070 -0.514420758 -0.350602169 -0.152422893 
#       Xdiff        Xdiff        Xdiff        Xdiff   Xdiffunion 
#-0.051649122  0.026884519 -0.037283125 -0.017165033  0.238070996 

#############################################################################

#############################################################################
#mean fixed effects
beta = data.matrix(result_OLS$coeff)
alphas = Ya - cbind(constant, Xa)%*%beta
#############################################################################

#Get delta estimates
#compute prediction errors
esq = (lnwage - cbind(constant, X)%*%beta - alphas)^2
#compute average prediction errors by occupation  
esq_occ = aggregate(esq, list(occ = as.numeric(data$occ80)), mean)
#assign avg vectors  
esqa = rep(0, nrow(esq_occ))

for (i in 1:nrow(esq_occ)){
	#assign average
	index = which(numocc == esq_occ$occ[i])
	esqa[index] = esq_occ$V1[i]
}
#compute differences
esqdiff = esq - esqa
result2_OLS = lm(esqdiff~Xdiff)


#############################################################################
#OLS Coefficients:
(Intercept)        Xdiff        Xdiff        Xdiff        Xdiff        Xdiff  
  3.216e-17   -8.124e-02   -7.089e-02   -6.132e-02   -4.790e-02   -3.885e-02  
      Xdiff        Xdiff        Xdiff        Xdiff        Xdiff        Xdiff  
 -3.045e-02   -3.067e-02   -2.088e-02   -1.480e-02   -6.344e-02   -7.780e-02  
      Xdiff        Xdiff        Xdiff        Xdiff        Xdiff        Xdiff  
 -7.436e-02   -7.227e-02   -6.352e-02   -6.909e-02   -6.793e-02   -6.107e-02  
      Xdiff        Xdiff        Xdiff        Xdiff        Xdiff        Xdiff  
 -5.995e-02   -4.533e-02   -5.289e-02   -5.953e-02   -1.198e-02    1.597e-04  
      Xdiff        Xdiff        Xdiff        Xdiff   Xdiffunion  
  8.032e-02   -1.000e-02   -8.959e-03    5.529e-03   -1.092e-02  
#############################################################################

#############################################################################
#variance fixed effects
psi = data.matrix(result2_OLS$coeff)
deltas = esqa -  cbind(constant, Xa)%*%psi
#############################################################################

#############################################################################
#Compute variance breakdown
mean_alpha = mean(alphas)
mualphas = rep(mean_alpha, n)
between = sum((alphas - mualphas)^2)/n
within = sum(deltas)/n
totalvar =  between + within 

#between
# 0.06150652
# within
# 0.2514606
# totalvar
# 0.3129671

#############################################################################

##################################################
#Part 2: Summary statistics and robustness checks
##################################################
minwage = min(lnwage)
#0.2715286
maxwage = max(lnwage)
#5.929458
countocc = table(numocc)
mincount = min(countocc)
#1
maxcount = max(countocc)
#17533
minavg = min(avgY[, 2])
# 1.688173
maxavg = max(avgY[, 2])
#3.894446

#variance by occupation
var_occ = aggregate(lnwage, list(occ = as.numeric(data$occ80)), var)
min(var_occ)
max(var_occ)



